Deep Learning
1. Forward and backward pass
Based on the DL08 slides uploaded to yscec, we will implement the forward and backward pass.
(a) Complete the code
Complete forward and backpropagation code based on the DL08 slide.
Initialization
Assign initial values based on the DL08 slide.
w1 = 0.15
w2 = 0.20
w3 = 0.25
w4 = 0.30
w5 = 0.40
w6 = 0.45
w7 = 0.50
w8 = 0.55
w = c(w1, w2, w3, w4, w5, w6, w7, w8)
b1 = 0.35
b2 = 0.60
b = c(b1, b2)
Here are the input and target values.
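# input and target values (as in the full code below)
input1 = 0.05; input2 = 0.10
input = c(input1, input2)
out1 = 0.01; out2 = 0.99
out = c(out1, out2)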
Set the learning rate.
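# learning rate (as in the full code below)
gamma = 0.5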
Functions
A sigmoid function maps its input into the interval (0, 1); it can be expressed as \( \sigma(z) = \frac{1}{1 + e^{-z}} \) and implemented in R as below.
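# sigmoid activation (same definition as in the full code below)
sigmoid = function(z){return( 1/(1+exp(-z) ))}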
The forwardProp function performs the forward propagation.
forwardProp = function(input, w, b){
# input to hidden layer
neth1 = w[1]*input[1] + w[2]*input[2] + b[1]
neth2 = w[3]*input[1] + w[4]*input[2] + b[1]
outh1 = sigmoid(neth1)
outh2 = sigmoid(neth2)
# hidden layer to output layer
neto1 = w[5]*outh1 + w[6]*outh2 + b[2]
neto2 = w[7]*outh1 + w[8]*outh2 + b[2]
outo1 = sigmoid(neto1)
outo2 = sigmoid(neto2)
res = c(outh1, outh2, outo1, outo2)
return(res)}
The error function calculates the halved squared error between the targets and the network outputs, \(E = \tfrac{1}{2}(out_1 - out_{o1})^2 + \tfrac{1}{2}(out_2 - out_{o2})^2\).
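# halved squared-error loss (same definition as in the full code below)
error = function(res, out){
err = 0.5*(out[1] - res[3])^2 + 0.5*(out[2] - res[4])^2
return(err)}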
Iterations
To draw the error rate figure, I will store the error value from each iteration of the for loop in the err list.
To see the changes in the parameters, I will store the weight values from each iteration of the for loop in the wei list.
To see the prediction results within 1,000 iterations, I will store the output values from each iteration of the for loop in the output list.
Set the number of iterations and initialize these storage objects (see the snippet below).
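# number of iterations and storage objects
# (output is initialized as a list so the assignments inside the loop work)
iteration = 1000
err <- c()
wei <- list()
output <- list()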
Repeat the steps below for the chosen number of iterations.
for (i in 1:iteration) {
res = forwardProp(input, w, b)
output[i] = list(cbind(res[3], res[4]))
outh1 = res[1]; outh2 = res[2]; outo1 = res[3]; outo2 = res[4]
#compute dE_dw5
dE_douto1 = -( out[1] - outo1 )
douto1_dneto1 = outo1*(1-outo1)
dneto1_dw5 = outh1
dE_dw5 = dE_douto1*douto1_dneto1*dneto1_dw5
#compute dE_dw6
dneto1_dw6 = outh2
dE_dw6 = dE_douto1*douto1_dneto1*dneto1_dw6
#compute dE_dw7
dE_douto2 = -( out[2] - outo2 )
douto2_dneto2 = outo2*(1-outo2)
dneto2_dw7 = outh1
dE_dw7 = dE_douto2*douto2_dneto2*dneto2_dw7
#compute dE_dw8
dneto2_dw8 = outh2
dE_dw8 = dE_douto2*douto2_dneto2*dneto2_dw8
#compute dE_db2
dE_db2 = dE_douto1*douto1_dneto1*1+dE_douto2*douto2_dneto2*1
#compute dE_douth1
dneto1_douth1 = w5
dneto2_douth1 = w7
dE_douth1 = dE_douto1*douto1_dneto1*dneto1_douth1 + dE_douto2*douto2_dneto2*dneto2_douth1
#compute dE_dw1
douth1_dneth1 = outh1*(1-outh1)
dneth1_dw1 = input[1]
dE_dw1 = dE_douth1*douth1_dneth1*dneth1_dw1
#compute dE_dw2
dneth1_dw2 = input[2]
dE_dw2 = dE_douth1*douth1_dneth1*dneth1_dw2
#compute dE_douth2
dneto1_douth2 = w6
dneto2_douth2 = w8
dE_douth2 = dE_douto1*douto1_dneto1*dneto1_douth2 + dE_douto2*douto2_dneto2*dneto2_douth2
#compute dE_dw3
douth2_dneth2 = outh2*(1-outh2)
dneth2_dw3 = input[1]
dE_dw3 = dE_douth2*douth2_dneth2*dneth2_dw3
#compute dE_dw4
dneth2_dw4 = input[2]
dE_dw4 = dE_douth2*douth2_dneth2*dneth2_dw4
#compute dE_db1
dE_db1 = dE_douth1*douth1_dneth1*1 + dE_douth2*douth2_dneth2*1   # dneth1_db1 = dneth2_db1 = 1
#update all parameters via a gradient descent
w1 = w1 - gamma*dE_dw1
w2 = w2 - gamma*dE_dw2
w3 = w3 - gamma*dE_dw3
w4 = w4 - gamma*dE_dw4
w5 = w5 - gamma*dE_dw5
w6 = w6 - gamma*dE_dw6
w7 = w7 - gamma*dE_dw7
w8 = w8 - gamma*dE_dw8
b1 = b1 - gamma*dE_db1
b2 = b2 - gamma*dE_db2
w = c(w1, w2, w3, w4, w5, w6, w7, w8)
b = c(b1, b2)
err[i] = error(res, out)
wei[i] = list(w)}
Full code
w1 = 0.15; w2 = 0.20; w3 = 0.25; w4 = 0.30; w5 = 0.40; w6 = 0.45; w7 = 0.50; w8 = 0.55
w = c(w1, w2, w3, w4, w5, w6, w7, w8)
b1 = 0.35; b2 = 0.60
b = c(b1, b2)
input1 = 0.05; input2 = 0.10
input = c(input1, input2)
out1 = 0.01; out2 = 0.99
out = c(out1, out2)
gamma = 0.5
sigmoid = function(z){return( 1/(1+exp(-z) ))}
forwardProp = function(input, w, b){
neth1 = w[1]*input[1] + w[2]*input[2] + b[1]
neth2 = w[3]*input[1] + w[4]*input[2] + b[1]
outh1 = sigmoid(neth1)
outh2 = sigmoid(neth2)
neto1 = w[5]*outh1 + w[6]*outh2 + b[2]
neto2 = w[7]*outh1 + w[8]*outh2 + b[2]
outo1 = sigmoid(neto1)
outo2 = sigmoid(neto2)
res = c(outh1, outh2, outo1, outo2)
return(res)}
error = function(res, out){
err = 0.5*(out[1] - res[3])^2 + 0.5*(out[2] - res[4])^2
return(err)}
iteration = 1000
err <- c()
wei <- list()
output <- list()
for (i in 1:iteration) {
res = forwardProp(input, w, b)
output[i] = list(cbind(res[3], res[4]))
outh1 = res[1]; outh2 = res[2]; outo1 = res[3]; outo2 = res[4]
#compute dE_dw5
dE_douto1 = -( out[1] - outo1 )
douto1_dneto1 = outo1*(1-outo1)
dneto1_dw5 = outh1
dE_dw5 = dE_douto1*douto1_dneto1*dneto1_dw5
#compute dE_dw6
dneto1_dw6 = outh2
dE_dw6 = dE_douto1*douto1_dneto1*dneto1_dw6
#compute dE_dw7
dE_douto2 = -( out[2] - outo2 )
douto2_dneto2 = outo2*(1-outo2)
dneto2_dw7 = outh1
dE_dw7 = dE_douto2*douto2_dneto2*dneto2_dw7
#compute dE_dw8
dneto2_dw8 = outh2
dE_dw8 = dE_douto2*douto2_dneto2*dneto2_dw8
#compute dE_db2
dE_db2 = dE_douto1*douto1_dneto1*1+dE_douto2*douto2_dneto2*1
#compute dE_douth1
dneto1_douth1 = w5
dneto2_douth1 = w7
dE_douth1 = dE_douto1*douto1_dneto1*dneto1_douth1 + dE_douto2*douto2_dneto2*dneto2_douth1
#compute dE_dw1
douth1_dneth1 = outh1*(1-outh1)
dneth1_dw1 = input[1]
dE_dw1 = dE_douth1*douth1_dneth1*dneth1_dw1
#compute dE_dw2
dneth1_dw2 = input[2]
dE_dw2 = dE_douth1*douth1_dneth1*dneth1_dw2
#compute dE_douth2
dneto1_douth2 = w6
dneto2_douth2 = w8
dE_douth2 = dE_douto1*douto1_dneto1*dneto1_douth2 + dE_douto2*douto2_dneto2*dneto2_douth2
#compute dE_dw3
douth2_dneth2 = outh2*(1-outh2)
dneth2_dw3 = input[1]
dE_dw3 = dE_douth2*douth2_dneth2*dneth2_dw3
#compute dE_dw4
dneth2_dw4 = input[2]
dE_dw4 = dE_douth2*douth2_dneth2*dneth2_dw4
#compute dE_db1
dE_db1 = dE_douth1*douth1_dneth1*1 + dE_douth2*douth2_dneth2*1   # dneth1_db1 = dneth2_db1 = 1
#update all parameters via a gradient descent
w1 = w1 - gamma*dE_dw1; w2 = w2 - gamma*dE_dw2; w3 = w3 - gamma*dE_dw3; w4 = w4 - gamma*dE_dw4
w5 = w5 - gamma*dE_dw5; w6 = w6 - gamma*dE_dw6; w7 = w7 - gamma*dE_dw7; w8 = w8 - gamma*dE_dw8
b1 = b1 - gamma*dE_db1; b2 = b2 - gamma*dE_db2
w = c(w1, w2, w3, w4, w5, w6, w7, w8)
b = c(b1, b2)
err[i] = error(res, out)
wei[i] = list(w)}
(b) Update weights
Using the code from 1(a), update the weight parameters from one iteration of forward and backpropagation.
The updated parameters are stored in the wei list, so the weights after one iteration of forward and backpropagation are given by wei[[1]]. There are some changes in the weight parameters compared with the initial values; a hand check of the \(w_5\) update is given after the table.
| | W1 | W2 | W3 | W4 | W5 | W6 | W7 | W8 |
|---|---|---|---|---|---|---|---|---|
| given | 0.15 | 0.2 | 0.25 | 0.3 | 0.4 | 0.45 | 0.5 | 0.55 |
| 1st | 0.1497807 | 0.1995614 | 0.2497511 | 0.2995023 | 0.3589165 | 0.4086662 | 0.5113013 | 0.5613701 |
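As a check of the "1st" row, the \(w_5\) update can be reproduced by hand from a forward pass with the initial parameters:
\[ net_{h1} = 0.15 \times 0.05 + 0.20 \times 0.10 + 0.35 = 0.3775, \qquad out_{h1} = \sigma(0.3775) \approx 0.5933 \]
\[ out_{o1} = \sigma(0.40 \times 0.5933 + 0.45 \times 0.5969 + 0.60) \approx 0.7514 \]
\[ \frac{\partial E}{\partial w_5} = -(0.01 - 0.7514) \times 0.7514(1 - 0.7514) \times 0.5933 \approx 0.0822 \]
\[ w_5^{new} = 0.40 - 0.5 \times 0.0822 \approx 0.3589, \]
which matches the value 0.3589165 in the table.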
(c) Different learning rates
Try three different learning rates (\(\gamma = 0.1\), \(\gamma = 0.6\), \(\gamma = 1.2\)) and repeat forward/back propagation for 10,000 iterations. Draw the error rate figure and calculate the prediction results as on pages 37-38 of the DL08 slides. Discuss the convergence of the neural network fits under the different learning rates.
For the different learning rates, we can set the gamma parameter accordingly in the code, and for the larger number of iterations we can set iteration = 10000. One way to organize the three runs is sketched below.
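A minimal sketch: the wrapper runBackprop is a hypothetical helper (not part of the original code; the same results can be obtained by simply re-running the full code from 1(a) with gamma = 0.1, 0.6 and 1.2). It condenses the 1(a) training loop into a function and stores err1/err2/err3 and output1/output2/output3, the names used by the plotting code below. forwardProp, error, input and out are assumed to be defined as in 1(a).
runBackprop = function(gamma, iteration) {
  w = c(0.15, 0.20, 0.25, 0.30, 0.40, 0.45, 0.50, 0.55)   # initial weights
  b = c(0.35, 0.60)                                        # initial biases
  err = c(); output = list()
  for (i in 1:iteration) {
    res = forwardProp(input, w, b)
    outh = res[1:2]; outo = res[3:4]
    delta_o = (outo - out) * outo * (1 - outo)             # dE/dnet at the output layer
    delta_h = c(delta_o[1]*w[5] + delta_o[2]*w[7],
                delta_o[1]*w[6] + delta_o[2]*w[8]) * outh * (1 - outh)   # dE/dnet at the hidden layer
    grad_w = c(delta_h[1]*input[1], delta_h[1]*input[2], delta_h[2]*input[1], delta_h[2]*input[2],
               delta_o[1]*outh[1], delta_o[1]*outh[2], delta_o[2]*outh[1], delta_o[2]*outh[2])
    grad_b = c(sum(delta_h), sum(delta_o))
    w = w - gamma * grad_w                                 # gradient descent update
    b = b - gamma * grad_b
    err[i] = error(res, out)
    output[[i]] = c(res[3], res[4])
  }
  return(list(err = err, output = output))
}
iteration = 10000
fit1 = runBackprop(0.1, iteration); err1 = fit1$err; output1 = fit1$output
fit2 = runBackprop(0.6, iteration); err2 = fit2$err; output2 = fit2$output
fit3 = runBackprop(1.2, iteration); err3 = fit3$err; output3 = fit3$output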
Error rate figure
par(mar=c(4.5, 4.1, 1.5, 2.1))
plot(1:iteration, err1, col="blue", type="l", lty=1, ylab="Error", xlab = "Iteration : 10,000")
lines(1:iteration, err2, col="red")
lines(1:iteration, err3, col="green")
sub = c("learning rate = 0.1", "learning rate = 0.6", "learning rate = 1.2")
legend(7500, 0.29, legend = sub, col=c("blue","red","green"), lty=1, cex=0.8, bg='aliceblue', lwd=10)
Prediction results
To get the predicted values under the different learning rates, the output lists were used; output1, output2, and output3 correspond to the rates 0.1, 0.6, and 1.2 respectively.
| | learning rate = 0.1 | learning rate = 0.6 | learning rate = 1.2 | True Values |
|---|---|---|---|---|
| out1 | 0.0217111 | 0.0117727 | 0.0104678 | 0.01 |
| out2 | 0.9784156 | 0.9882453 | 0.9895359 | 0.99 |
Convergence
Most of the error curves already look very similar at the far left of the plot. As a result, the differences between the errors for the different learning rates are compressed, which makes them difficult to judge.
To deal with this skewness, I therefore used a logarithmic scale in figure [1] below; alternatively, it is useful to look only at the first 1,000 iterations, as in figure [2] below. Although the errors for all three learning rates converge well as the iterations go on, it seems clear that, within this range, the larger the learning rate, the closer the fit.
This can also be seen in the prediction results above. Although all three models predict well compared with the true values 0.01 and 0.99, the model with the largest learning rate, \(\gamma = 1.2\), gives the closest predictions.
This makes sense: the more iterations are performed, the more precise the parameters become, and a larger step size lets the model move more quickly, which has a similar effect on convergence as running more iterations.
However, this does not mean that a larger step size or learning rate is always a good hyperparameter. A learning rate that is too large can cause the model to converge too quickly to a suboptimal solution, whereas a learning rate that is too small can cause the process to get stuck. Therefore, carefully selecting the learning rate is one of the most important choices when fitting a neural network.
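A sketch of how the two figures mentioned above can be drawn (not necessarily the exact code used): figure [1] shows the error on a logarithmic scale over all 10,000 iterations, and figure [2] zooms in on the first 1,000 iterations.
par(mfrow = c(1, 2), mar = c(4.5, 4.1, 1.5, 2.1))
# figure [1]: log-scaled error over all 10,000 iterations
plot(1:iteration, err1, col = "blue", type = "l", log = "y",
     ylab = "Error (log scale)", xlab = "Iteration : 10,000")
lines(1:iteration, err2, col = "red")
lines(1:iteration, err3, col = "green")
# figure [2]: first 1,000 iterations only
plot(1:1000, err1[1:1000], col = "blue", type = "l",
     ylab = "Error", xlab = "Iteration : first 1,000")
lines(1:1000, err2[1:1000], col = "red")
lines(1:1000, err3[1:1000], col = "green")
legend("topright", legend = sub, col = c("blue", "red", "green"), lty = 1, cex = 0.8)
par(mfrow = c(1, 1))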
2. Perceptron
Based on the DL09 slides uploaded to yscec, we will implement a linearly separable classifier (a perceptron) for the iris data set.
(a) Complete the code
Complete the perceptron code for classifying setosa and virginica based on the DL09 slide.
Description
Here, only two species from the iris data set, setosa and virginica, will be used, and the sepal and petal lengths will be the observed values used to distinguish between them. Below is a scatter plot showing that setosa and virginica can be separated by a single linear equation (a base-R sketch for this plot is given after the initialization code).
Initialization
Required data is as follows.
data(iris)
iris_sub = iris[-c(51:100), c(1,3,5)]
names(iris_sub) = c("sepal", "petal", "species")
x = iris_sub[, c(1,2)]
y = ifelse(iris_sub[,3] == 'setosa', -1, 1)
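A base-R sketch of the scatter plot referred to in the description above (a ggplot2 version of the same figure appears in 2(c)):
plot(iris_sub$sepal, iris_sub$petal,
     col = ifelse(iris_sub$species == "setosa", "blue", "red"), pch = 19,
     xlab = "Sepal length", ylab = "Petal length",
     main = "setosa vs virginica")
legend("topleft", legend = c("setosa", "virginica"), col = c("blue", "red"), pch = 19)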
Initialize the weight, bias, and k parameters (k counts the number of updates), and set the learning rate to 0.5.
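The corresponding values (w, b and k are also reset inside the perceptron function below, so in the full code only learning.rate is set in the calling environment):
w = c(0, 0)    # initial weights
b = 0          # initial bias
k = 0          # counts the number of updates
learning.rate = 0.5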
Functions
To distinguish between setosa and virginica, the sign of \(w_1 \times (\text{Sepal length}) + w_2 \times (\text{Petal length}) + b\) will be used.
distance.from.plane = function(z,w,b){sum(z*w)+b}
classify.linear = function(x,w,b){
distances = apply(x, 1, distance.from.plane, w, b)
return(ifelse(distances <0, -1, +1))}
Also, to update the bias parameter b, \(R = \max_i \lVert x_i \rVert\) will be used, computed with the euclidean.norm helper below.
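# Euclidean norm of one observation (same definition as in the full code);
# R = max(apply(x, 1, euclidean.norm)) is then the largest norm among the observations
euclidean.norm = function(x) {sqrt(sum(x * x))}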
Therefore, the perceptron function is defined as follows.
perceptron = function(x, y, learning.rate){
w = c(0,0)
b = 0
k = 0
R = max(apply(x, 1, euclidean.norm))
mark.complete = TRUE
while (mark.complete) {
mark.complete = FALSE
yc = classify.linear(x,w,b)
for (i in 1:nrow(x)) {
if (y[i] != yc[i]) {
w = w + learning.rate * y[i] * x[i,]
b = b + learning.rate * y[i] * R^2
k = k+1
mark.complete=TRUE
}
}
}
return(list(w,b,k))
}
Full code
data(iris)
iris_sub = iris[-c(51:100), c(1,3,5)]
names(iris_sub) = c("sepal", "petal", "species")
x = iris_sub[, c(1,2)]
y = ifelse(iris_sub[,3] == 'setosa', -1, 1)
learning.rate = 0.5
euclidean.norm = function(x) {sqrt(sum(x * x))}
distance.from.plane = function(z,w,b){sum(z*w)+b}
classify.linear = function(x,w,b){
distances = apply(x, 1, distance.from.plane, w, b)
return(ifelse(distances <0, -1, +1))}
perceptron = function(x, y, learning.rate){
w = c(0,0)
b = 0
k = 0
R = max(apply(x, 1, euclidean.norm))
mark.complete = TRUE
while (mark.complete) {
mark.complete = FALSE
yc = classify.linear(x,w,b)
for (i in 1:nrow(x)) {
if (y[i] != yc[i]) {
w = w + learning.rate * y[i] * x[i,]
b = b + learning.rate * y[i] * R^2
k = k+1
mark.complete=TRUE
}
}
}
return(list(w,b,k))
}
results = perceptron(x, y, learning.rate)
(b) Report results
Report your estimated weight and bias parameters. How many iterations are required for your update?
## [[1]]
## sepal petal
## 1 124.85 516.9
##
## [[2]]
## [1] -1710.4
##
## [[3]]
## [1] 568
| sepal | petal | bias | iterations |
|---|---|---|---|
| 124.85 | 516.9 | -1710.4 | 568 |
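As a quick sanity check (a sketch using the rounded values reported above), the learned parameters should classify every training observation correctly, since the while loop only stops once no point is misclassified:
w.hat = c(124.85, 516.9)   # reported weights (sepal, petal)
b.hat = -1710.4            # reported bias
all(classify.linear(x, w.hat, b.hat) == y)   # expected to be TRUE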
(c) Draw linear classifier
Draw your linear classifier for setosa and virginica as on page 18 of the DL09 slide.
linear classifier: \[ b + w_1x_1 + w_2x_2 = 0, \qquad \text{i.e.} \quad x_2 = -\frac{b}{w_2} - \frac{w_1}{w_2}x_1, \] which gives the intercept and slope used in geom_abline below.
library(ggplot2)
par(mar=c(6, 4.1, 3, 2.1))
p <- ggplot(iris_sub, aes(x=sepal, y=petal))+
geom_point(aes(colour = species, shape = species), size = 3) +
xlab("Sepal length") +
ylab("Petal length") +
ggtitle("Species VS Sepal and Petal lengths")
p+geom_abline(intercept=unlist(-results[[2]]/results[[1]][2]), slope=unlist(-results[[1]][1]/results[[1]][2]))